
import os
import pandas as pd
import numpy as np
from flask import jsonify
import seaborn as sns
import matplotlib.pyplot as plt
# To visualize the whole grid
pd.options.display.max_columns = 999

# Read both train and test data
train = pd.read_csv("../../data/train.csv")
test = pd.read_csv("../../data/test.csv")

# Check for missing values in the train and test file
null_values_train = train.isnull().sum().sort_values(ascending = False)
null_values_train
null_values_test = test.isnull().sum().sort_values(ascending = False)
null_values_test

# Because Product 3 and 2 have missing values, calculate the % of missing values
# (fraction of rows missing, restricted to columns that actually have NAs).
null_values_train = null_values_train[null_values_train >0]/train.shape[0]
null_values_test = null_values_test[null_values_test >0]/test.shape[0]

# Append the percentages to a report file.
# FIX: the print() calls must be indented inside the `with` block; as
# written they sat at top level, which is an IndentationError (and would
# otherwise write to an already-closed file handle).
with open('missing_values.html', "a") as missing:
    print('--------------Train % -----------', file = missing)
    print(f'{null_values_train*100} %', file = missing)
    print('--------------Test % -----------', file = missing)
    print(f'{null_values_test*100} %', file = missing)
The output of the missing values in Product Cat 3 & 2 showed us that Product Cat 3 has 69.6% of missing values and Product Cat 2 has 31.5% of missing values. Hence, we can assume that for all missing values (NA), people actually did not buy those product categories, and we will fill those NA values with zeros.
# A null value is taken to mean the customer did not buy in that product
# category, so every NA becomes 0 in both datasets.
train = train.fillna(0)
test = test.fillna(0)
# Inspect column dtypes after imputation.
train.dtypes
# User_ID and Product_ID are identifiers, not predictive features.
id_columns = ['User_ID', 'Product_ID']
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)
test
We are converting features to ordinal/discrete values in order to execute the model.
# Encode the remaining object columns as ordinal integers so the models can
# consume them.  '4+' years and '55+' age collapse into the top bucket.
train['Stay_In_Current_City_Years'].unique()
ordinal_encodings = {
    'Stay_In_Current_City_Years': {'4+': 4, '1': 1, '2': 2, '3': 3, '0': 0},
    'City_Category': {'A': 0, 'B': 1, 'C': 2},
    'Age': {'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3,
            '46-50': 4, '51-55': 5, '55+': 6},
    'Gender': {'F': 0, 'M': 1},
}
for column, encoding in ordinal_encodings.items():
    train[column] = train[column].map(encoding)
    test[column] = test[column].map(encoding)
# Sanity check: only the mapped integer codes remain.
train['Gender'].unique()
The correlation matrix helps us understand the strength of the linear relations between two variables. Ideally we want to understand the relation between the dependent variable (Purchase) and all other independent variables. The correlation heatmap shows us that the dependent variable Purchase has correlation with the following independent variables: Marital Status, Age, and Product_Cat 3.
Why only the Product Cat 3 has a linear relation with the dependent variable?
See 3.3
Does this mean that these three independent variables have the most influence on the output of the dependent variable?
To be determined
# Correlation matrix: which independent variables move with the dependent
# variable (Purchase)?
correlation_matrix = train.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, vmax=0.8, cmap="coolwarm", square=True, annot=True)
plt.savefig("../images/corr_matrix.png", bbox_inches="tight")

# Why is product category 3 correlated with Purchase while the others are
# not?  Compare the per-category averages.
cat_1_average = train['Product_Category_1'].mean()
cat_2_average = train['Product_Category_2'].mean()
cat_3_avarage = train['Product_Category_3'].mean()
print(f"PC1: {cat_1_average} \n PC2: {cat_2_average} \n PC3 : {cat_3_avarage} \n Hence we can conclude the correlation of PC3 is due to average price. PC3 has on average the cheaper price")

# Remove rows whose Product_Category_1 is 19 or 20 from train and test.
train = train[~train.Product_Category_1.isin([19, 20])]
test = test[~test.Product_Category_1.isin([19, 20])]
test
# Convert Stay_in_current_city_years to binary by using dummy variables
# (left disabled: the ordinal encoding above is used in this pass; a full
#  one-hot version is built in a later section of this file)
# train = pd.get_dummies(train, columns=['Stay_In_Current_City_Years'])
# test = pd.get_dummies(test, columns=['Stay_In_Current_City_Years'])
# Check data types
train.dtypes
# Save clean file
# Persist the cleaned datasets; later sections re-read these CLEAN files.
train.to_csv("../../data/trainCLEAN.csv", index=False, encoding='utf8')
test.to_csv("../../data/testCLEAN.csv", index=False, encoding='utf8')
train.columns
Assign the features to X and y, where X are the independent variables and y is the target value.
# Independent variables (the cleaned ordinal features) and target Purchase.
feature_columns = ['Gender', 'Age', 'Occupation', 'City_Category',
                   'Stay_In_Current_City_Years', 'Marital_Status',
                   'Product_Category_1', 'Product_Category_2',
                   'Product_Category_3']
X = train[feature_columns]
y = train['Purchase'].values.reshape(-1, 1)
print(X.shape, y.shape)
# Hold out 20% of the rows for testing.
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
### END SOLUTION
https://www.datacamp.com/community/tutorials/tutorial-ridge-lasso-elastic-net
### BEGIN SOLUTION
# Plain ordinary-least-squares baseline.
from sklearn.linear_model import LinearRegression
model = LinearRegression(n_jobs=1).fit(X_train, y_train)
### END SOLUTION
Run the predictions with .predict(X_test) and plot a residual plot. If the points in the residual plot are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data. Otherwise, a non-linear model is more appropriate.
# Make predictions using the X_test data and draw a residual plot
# (predictions vs residuals, for both splits).  Randomly dispersed points
# around the horizontal axis mean a linear model is adequate.
### BEGIN SOLUTION
predictions = model.predict(X_test)
plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c="blue", label="Training Data")
plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
plt.title("Residual Plot")
plt.show()
### END SOLUTION
linear_predictions = model.predict(X_test)
# FIX: corrected typo in the user-facing message ("costumers" -> "customers").
print("Predicted purchases (in dollars) for new customers:", linear_predictions)
To evaluate the model we will use the MSE (Mean Square Error), RMSE (Root Mean Squared Error) and R2
MSE: Measures the average of the squares of the errors—that is, the average squared difference between the estimated values and the actual value. The MSE is a measure of the quality of an estimator—it is always non-negative, and values closer to zero are better.
RMSE: Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit. RMSE is always non-negative, and a value of 0 (almost never achieved in practice) would indicate a perfect fit to the data. In general, a lower RMSE is better than a higher one. However, comparisons across different types of data would be invalid because the measure is dependent on the scale of the numbers used.
R2: R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression. In general, the higher the R-squared, the better the model fits your data.
R-squared as a Biased Estimate R-squared measures the strength of the relationship between the predictors and response. The R-squared in your regression output is a biased estimate based on your sample.
R-squared is like the broken bathroom scale: it is deceptively large. Researchers have long recognized that regression’s optimization process takes advantage of chance correlations in the sample data and inflates the R-squared.
This bias is a reason why some practitioners don’t use R-squared at all—it tends to be wrong.
Fortunately, there is a solution and you’re probably already familiar with it: adjusted R-squared. I’ve written about using the adjusted R-squared to compare regression models with a different number of terms.
# Use X_test, y_test, and model.predict(X_test) to calculate MSE, RMSE and R2.
### BEGIN SOLUTION
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)
# FIX: adjusted R2 must use the number of observations the R2 was computed
# on (the test split), not the size of the full dataset.
n_obs = len(y_test)
r2_adjusted = 1 - (1 - r2) * (n_obs - 1) / (n_obs - X.shape[1] - 1)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}, R2_Adjusted: {r2_adjusted}")
# Coefficients of the fitted model; coef_ is (1, n_features) because y was
# reshaped to a column vector, hence the [0].
coeff_df = pd.DataFrame(model.coef_[0], X_train.columns, columns=['Coefficient'])
coeff_df
ax = coeff_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
# Heatmap of pairwise feature correlations (to spot multicollinearity).
import seaborn as sns
corr = X.corr()
heat_ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
heat_ax.set_xticklabels(
    heat_ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
);
https://www.statisticshowto.datasciencecentral.com/lasso-regression/
Lasso (least absolute shrinkage and selection operator) regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of regression is well-suited for models showing high levels of multicollinearity or when you want to automate certain parts of model selection, like variable selection/parameter elimination.
Multicollinearity
Multicollinearity generally occurs when there are high correlations between two or more predictor variables. In other words, one predictor variable can be used to predict the other. This creates redundant information, skewing the results in a regression model. Examples of correlated predictor variables (also called multicollinear predictors) are: a person’s height and weight, age and sales price of a car, or years of education and annual income. An easy way to detect multicollinearity is to calculate correlation coefficients for all pairs of predictor variables. If the correlation coefficient, r, is exactly +1 or -1, this is called perfect multicollinearity. If r is close to or exactly -1 or +1, one of the variables should be removed from the model if at all possible.
It’s more common for multicollinearity to rear its ugly head in observational studies; it’s less common with experimental data. When the condition is present, it can result in unstable and unreliable regression estimates. Several other problems can interfere with analysis of results, including:
# LASSO model (alpha = .01 as required by the activity).
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.01)
lasso.fit(X_train, y_train)
lasso_predictions = lasso.predict(X_test)
r2 = lasso.score(X_test, y_test)
MSE = mean_squared_error(y_test, lasso_predictions)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
# Lasso coefficients per feature.
lasso_df = pd.DataFrame(lasso.coef_, X_train.columns, columns=['Coefficient'])
lasso_df
ax = lasso_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Ridge_Regression.pdf
Ridge Regression is a technique for analyzing multiple regression data that suffer from multicollinearity. When multicollinearity occurs, least squares estimates are unbiased, but their variances are large so they may be far from the true value. By adding a degree of bias to the regression estimates, ridge regression reduces the standard errors. It is hoped that the net effect will be to give estimates that are more reliable.
# Ridge model (alpha = .01 as required by the activity).
from sklearn.linear_model import Ridge
### BEGIN SOLUTION
ridge = Ridge(alpha=.01)
ridge.fit(X_train, y_train)
ridge_predictions = ridge.predict(X_test)
r2 = ridge.score(X_test, y_test)
MSE = mean_squared_error(y_test, ridge_predictions)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
# Ridge was fitted against a column-vector y, so coef_ is (1, n_features).
ridge_df = pd.DataFrame(ridge.coef_[0], X_train.columns, columns=['Coefficient'])
ridge_df
ax = ridge_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
https://hackernoon.com/an-introduction-to-ridge-lasso-and-elastic-net-regression-cca60b4b934f
In statistics and, in particular, in the fitting of linear or logistic regression models, the elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods.
# ElasticNet model (alpha = .01 as required by the activity).
from sklearn.linear_model import ElasticNet
### BEGIN SOLUTION
elasticnet = ElasticNet(alpha=.01).fit(X_train, y_train)
elasticnet_predictions = elasticnet.predict(X_test)
MSE = mean_squared_error(y_test, elasticnet_predictions)
# FIX: r2 was never recomputed in this cell, so the print below reported
# the previous (Ridge) model's R2.  Score the ElasticNet model itself.
r2 = elasticnet.score(X_test, y_test)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
# ElasticNet coefficients per feature.
elasticnet_df = pd.DataFrame(elasticnet.coef_, X_train.columns, columns=['Coefficient'])
elasticnet_df
ax = elasticnet_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
# ---- Second pass: rebuild the data with one-hot (dummy) encoding ----
# Re-read both raw files.
train = pd.read_csv("../../data/train.csv")
test = pd.read_csv("../../data/test.csv")
# Count missing values per column.
null_values_train = train.isnull().sum().sort_values(ascending=False)
null_values_train
null_values_test = test.isnull().sum().sort_values(ascending=False)
null_values_test
# NA is taken to mean "did not buy in that product category".
train = train.fillna(0)
test = test.fillna(0)
# Drop the identifier columns from train only (test keeps its IDs; they are
# used for the final output table later on).
drop_columns = ['User_ID', 'Product_ID']
train.drop(drop_columns, inplace = True, axis =1)
# test.drop(drop_columns, inplace = True, axis =1)
# Remove rows with Product_Category_1 in {19, 20} from train.
train = train[~train.Product_Category_1.isin([19, 20])]
train
# One-hot encode every remaining object column.
train = pd.get_dummies(train)
train.head()
train.columns
# Feature matrix built from the dummy-encoded columns; target stays Purchase.
dummy_features = ['Occupation', 'Marital_Status', 'Product_Category_1',
                  'Product_Category_2', 'Product_Category_3', 'Gender_F',
                  'Gender_M', 'Age_0-17', 'Age_18-25', 'Age_26-35', 'Age_36-45',
                  'Age_46-50', 'Age_51-55', 'Age_55+', 'City_Category_A',
                  'City_Category_B', 'City_Category_C', 'Stay_In_Current_City_Years_0',
                  'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
                  'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+']
X = train[dummy_features]
y = train['Purchase'].values.reshape(-1, 1)
print(X.shape, y.shape)
# 80/20 train/test split.
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split
from sklearn import metrics
# NOTE(review): stratify=y was attempted but errors — y is continuous.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
### END SOLUTION
from sklearn.preprocessing import StandardScaler
# Fit standard scalers on the training data only, then apply them to both
# splits (features and target).
### BEGIN SOLUTION
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)
## END SOLUTION
# Transform the training and testing data with the fitted scalers.
### BEGIN SOLUTION
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
### END SOLUTION
### BEGIN SOLUTION
# OLS baseline on the scaled data.
from sklearn.linear_model import LinearRegression
model = LinearRegression(n_jobs=1).fit(X_train_scaled, y_train_scaled)
### END SOLUTION
Run the predictions with .predict(X_test) and plot a residual plot. If the points in the residual plot are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data. Otherwise, a non-linear model is more appropriate.
# Residual plot on the scaled data: predictions vs residuals, both splits.
### BEGIN SOLUTION
predictions = model.predict(X_test_scaled)
train_predictions = model.predict(X_train_scaled)
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.show()
### END SOLUTION
To evaluate the model we will use the MSE (Mean Square Error), RMSE (Root Mean Squared Error) and R2
MSE: Measures the average of the squares of the errors—that is, the average squared difference between the estimated values and the actual value. The MSE is a measure of the quality of an estimator—it is always non-negative, and values closer to zero are better.
RMSE: Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit. RMSE is always non-negative, and a value of 0 (almost never achieved in practice) would indicate a perfect fit to the data. In general, a lower RMSE is better than a higher one. However, comparisons across different types of data would be invalid because the measure is dependent on the scale of the numbers used.
R2: R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression. In general, the higher the R-squared, the better the model fits your data.
R-squared as a Biased Estimate R-squared measures the strength of the relationship between the predictors and response. The R-squared in your regression output is a biased estimate based on your sample.
R-squared is like the broken bathroom scale: it is deceptively large. Researchers have long recognized that regression’s optimization process takes advantage of chance correlations in the sample data and inflates the R-squared.
This bias is a reason why some practitioners don’t use R-squared at all—it tends to be wrong.
Fortunately, there is a solution and you’re probably already familiar with it: adjusted R-squared. I’ve written about using the adjusted R-squared to compare regression models with a different number of terms.
# MSE, RMSE and R2 on the scaled test split.
### BEGIN SOLUTION
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test_scaled, predictions)
RMSE = np.sqrt(MSE)
r2 = model.score(X_test_scaled, y_test_scaled)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
model.coef_[0]
# Bar chart of the coefficients (coef_ is (1, n_features) for column y).
coeff_df = pd.DataFrame(model.coef_[0], X_train.columns, columns=['Coefficient'])
coeff_df
ax = coeff_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
plt.savefig("../images/regression_coef.png", bbox_inches="tight")
# Heatmap of pairwise correlations between the dummy-encoded features.
import seaborn as sns
corr = X.corr()
heat_ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
heat_ax.set_xticklabels(
    heat_ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
);
https://www.statisticshowto.datasciencecentral.com/lasso-regression/
Lasso (least absolute shrinkage and selection operator) regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of regression is well-suited for models showing high levels of multicollinearity or when you want to automate certain parts of model selection, like variable selection/parameter elimination.
Multicollinearity
Multicollinearity generally occurs when there are high correlations between two or more predictor variables. In other words, one predictor variable can be used to predict the other. This creates redundant information, skewing the results in a regression model. Examples of correlated predictor variables (also called multicollinear predictors) are: a person’s height and weight, age and sales price of a car, or years of education and annual income. An easy way to detect multicollinearity is to calculate correlation coefficients for all pairs of predictor variables. If the correlation coefficient, r, is exactly +1 or -1, this is called perfect multicollinearity. If r is close to or exactly -1 or +1, one of the variables should be removed from the model if at all possible.
It’s more common for multicollinearity to rear its ugly head in observational studies; it’s less common with experimental data. When the condition is present, it can result in unstable and unreliable regression estimates. Several other problems can interfere with analysis of results, including:
# LASSO on the scaled data (alpha = .01 as required by the activity).
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_scaled)
lasso_predictions = lasso.predict(X_test_scaled)
r2 = lasso.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, lasso_predictions)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
lasso.coef_
# Lasso coefficients per dummy feature.
lasso_df = pd.DataFrame(lasso.coef_, X_train.columns, columns=['Coefficient'])
lasso_df
ax = lasso_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
plt.savefig("../images/lasso_coef.png", bbox_inches="tight")
https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Ridge_Regression.pdf
Ridge Regression is a technique for analyzing multiple regression data that suffer from multicollinearity. When multicollinearity occurs, least squares estimates are unbiased, but their variances are large so they may be far from the true value. By adding a degree of bias to the regression estimates, ridge regression reduces the standard errors. It is hoped that the net effect will be to give estimates that are more reliable.
# Ridge on the scaled data (alpha = .01 as required by the activity).
from sklearn.linear_model import Ridge
### BEGIN SOLUTION
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_scaled)
ridge_predictions = ridge.predict(X_test_scaled)
r2 = ridge.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, ridge_predictions)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
# coef_ is (1, n_features) because y_train_scaled is a column vector.
ridge_df = pd.DataFrame(ridge.coef_[0], X_train.columns, columns=['Coefficient'])
ridge_df
ax = ridge_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
plt.savefig("../images/ridge_coef.png", bbox_inches="tight")
https://hackernoon.com/an-introduction-to-ridge-lasso-and-elastic-net-regression-cca60b4b934f
In statistics and, in particular, in the fitting of linear or logistic regression models, the elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods.
# ElasticNet on the scaled data (alpha = .01 as required by the activity).
from sklearn.linear_model import ElasticNet
### BEGIN SOLUTION
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_scaled)
elasticnet_predictions = elasticnet.predict(X_test_scaled)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
RMSE = np.sqrt(MSE)
### END SOLUTION
print(f"MSE: {MSE}, RMSE: {RMSE}, R2: {r2}")
# ElasticNet coefficients per dummy feature.
elasticnet_df = pd.DataFrame(elasticnet.coef_, X_train.columns, columns=['Coefficient'])
elasticnet_df
ax = elasticnet_df.plot(kind='bar', title='Variable Coefficients')
ax.grid(zorder=0)
plt.savefig("../images/elasticnet_coef.png", bbox_inches="tight")
https://gdcoder.com/decision-tree-regressor-explained-in-depth/
Decision tree regression observes features of an object and trains a model in the structure of a tree to predict data in the future to produce meaningful continuous output. Continuous output means that the output/result is not discrete, i.e., it is not represented just by a discrete, known set of numbers or values.
Decision tree regressor uses MSE and similar metrics to determine splits.
# ---- Decision-tree pass: reload the cleaned (ordinal-encoded) data ----
train = pd.read_csv("../../data/trainCLEAN.csv")
testFinal = pd.read_csv("../../data/testCLEAN.csv")
train.columns
tree_features = ['Gender', 'Age', 'Occupation', 'City_Category',
                 'Stay_In_Current_City_Years', 'Marital_Status',
                 'Product_Category_1', 'Product_Category_2',
                 'Product_Category_3']
X = train[tree_features]
y = train['Purchase'].values.reshape(-1, 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
To evaluate the model we will use the MSE (Mean Square Error), RMSE (Root Mean Squared Error) and R2
MSE: Measures the average of the squares of the errors—that is, the average squared difference between the estimated values and the actual value. The MSE is a measure of the quality of an estimator—it is always non-negative, and values closer to zero are better.
RMSE: Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit. RMSE is always non-negative, and a value of 0 (almost never achieved in practice) would indicate a perfect fit to the data. In general, a lower RMSE is better than a higher one. However, comparisons across different types of data would be invalid because the measure is dependent on the scale of the numbers used.
Regressor_Score: Returns the coefficient of determination R^2 of the prediction.
R2: R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression. In general, the higher the R-squared, the better the model fits your data.
R-squared as a Biased Estimate R-squared measures the strength of the relationship between the predictors and response. The R-squared in your regression output is a biased estimate based on your sample.
R-squared is like the broken bathroom scale: it is deceptively large. Researchers have long recognized that regression’s optimization process takes advantage of chance correlations in the sample data and inflates the R-squared.
This bias is a reason why some practitioners don’t use R-squared at all—it tends to be wrong.
Fortunately, there is a solution and you’re probably already familiar with it: adjusted R-squared. I’ve written about using the adjusted R-squared to compare regression models with a different number of terms.
# https://stackoverflow.com/questions/46139186/interpreting-the-decisiontreeregressor-score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
# Depth and leaf-size limits keep the tree from overfitting.
regressor_model = DecisionTreeRegressor(max_depth=8, min_samples_leaf=150).fit(X_train, y_train)
regressor_predictions = regressor_model.predict(X_test)
# NOTE(review): this "score" is R2 on the TRAINING split, not the test split.
regressor_score = regressor_model.score(X_train,y_train)
print('Purchase Predictions:', regressor_predictions)
print('MAE:', metrics.mean_absolute_error(y_test, regressor_predictions))
tree_test_mse = metrics.mean_squared_error(y_test, regressor_predictions)
print('MSE:', tree_test_mse)
print('RMSE', np.sqrt(tree_test_mse))
print(f"Model Accuracy (score):{regressor_score*100}%")
# print('Test Variance score: %.2f' % r2_score(y_test, regressor_predictions))
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated R2 on the training split.
R2_coefficient_determination = cross_val_score(regressor_model, X_train, y_train, cv=10)
R2_coefficient_determination
print('Coefficient of Determination (a.k.a R2 in LR):', R2_coefficient_determination)
# DecisionTree in sklearn will automatically calculate feature importance
importances = regressor_model.feature_importances_
importances
# Sort the features by order of importance (descending).
# NOTE(review): zip pairs the 9 importances against train.columns — correct
# only while the feature columns precede 'Purchase'; X.columns would be safer.
regressor_coef = sorted(zip(regressor_model.feature_importances_, train.columns), reverse=True)
regressor_coef = pd.DataFrame(regressor_coef)
regressor_coef.columns = ['Coefficients', 'Features']
# Save to csv and HTML
# FIX: the CSV was written to the .html path and then immediately
# overwritten by to_html; write it to its own .csv file instead.
regressor_coef.to_csv('../../data/regressor_coef.csv')
regressor_coef.to_html('../../data/regressor_coef.html')
regressor_coef
https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
# Render the fitted tree with graphviz.
# FIX: sklearn.externals.six was removed from scikit-learn (>= 0.23);
# StringIO lives in the standard library's io module.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(regressor_model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# https://acadgild.com/blog/using-decision-trees-for-regression-problems
# Actual vs predicted purchases, with the identity line for reference.
fig, ax = plt.subplots()
ax.scatter(y_test, regressor_predictions, edgecolors='white')
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
# FIX: savefig must run before plt.show(); show() releases the figure, so
# the previous order (show then savefig) saved an empty image.
plt.savefig("../images/regressor_plot.png", bbox_inches = "tight")
plt.show()
# source: https://stackoverflow.com/questions/42049147/convert-list-to-pandas-dataframe-column
# Side-by-side comparison of actual vs predicted purchase values.
# Both frames get the default integer column name 0; merging on the index
# makes pandas suffix the overlapping names into the string labels
# '0_x' / '0_y', which the rename below relies on.
predicted = pd.DataFrame(np.array([regressor_predictions]).T)
actual = pd.DataFrame(y_test)
results = pd.merge(actual, predicted, left_index=True, right_index=True)
results.rename(columns = {'0_x':'Actual_Purchase', '0_y': 'Predicted_Purchase'}, inplace = True)
results
# Score the held-out competition test file.
test_predict = regressor_model.predict(testFinal)
test_predict
# Wrap the 1-D prediction array in a single-column frame named
# Test_Purchase (same values, default integer index).
test_predict_output = pd.DataFrame({'Test_Purchase': test_predict})
test_predict_output
# Create Table of Test file with results
# `test` still carries User_ID / Product_ID here (its drop was commented
# out in the second pass above), so join the IDs back onto the cleaned
# features and the predictions by row index.
# NOTE(review): testFinal had category-19/20 rows dropped while this `test`
# did not; the index join silently drops mismatched rows — confirm the
# alignment is intended.
table = pd.merge(test[['User_ID', 'Product_ID']], testFinal, left_index = True, right_index = True)
testTableFinal = pd.merge(table, test_predict_output, left_index = True, right_index = True)
#Save to CSV
testTableFinal.to_csv("../../data/testTableFinalCLEAN.csv", index=False, encoding='utf8')
# Save to HTML
testTableFinal.to_html('../../data/testTableFinalCLEAN.html')
testTableFinal
# Save to Json
testTableFinal.to_json("../../data/testTableFinalCLEAN.json", orient='records')
testTableFinal
# Save 100 records (first 50 + last 50) to a smaller table
testTableFinal1 = testTableFinal[:50]   # same as df.head(50)
testTableFinal2 = testTableFinal[-50:]  # same as df.tail(50)
# FIX: DataFrame.append is deprecated and removed in pandas 2.0; pd.concat
# is the supported equivalent.
testTableFinalAppend = pd.concat([testTableFinal1, testTableFinal2], ignore_index=True)
testTableFinalAppend
#Save to CSV
testTableFinalAppend.to_csv("../../data/testTableFinalCLEAN100.csv", index=False, encoding='utf8')
# Save to HTML
testTableFinalAppend.to_html('../../data/testTableFinalCLEAN100.html')
testTableFinal
# Save to Json
testTableFinalAppend.to_json("../../data/testTableFinalCLEAN100.json", orient='records')
testTableFinalAppend
# Create table to input in competition
test_predict_output = pd.merge(test[['User_ID', 'Product_ID']], test_predict_output, left_index = True, right_index = True)
test_predict_output
test_predict_output.info()
# Fuse the two identifiers into a single key column.
test_predict_output['UserID_and_ProductID'] = (
    test_predict_output["User_ID"].astype(str) + test_predict_output["Product_ID"].astype(str)
)
test_predict_output
# The raw identifier columns are no longer needed once fused.
test_predict_output = test_predict_output.drop(columns=['User_ID', 'Product_ID'])
test_predict_output
# Keep only the fused key and the prediction, then export.
test_predict_output = test_predict_output[['UserID_and_ProductID', 'Test_Purchase']]
#Save to CSV
test_predict_output.to_csv("../../data/testOutputCLEAN.csv", index=False, encoding='utf8')
test_predict_output